library(tidyverse)
library(rmarkdown)
library(ggrepel)
states <- map_data("state")
state_names <- states %>% group_by(region) %>% summarize(long=mean(long), lat=mean(lat))
acc <- read.csv("https://raw.githubusercontent.com/xdaiISU/ds202materials/master/hwlabs/fars2017/accident.csv", stringsAsFactors = FALSE)
ppl <- read.csv("https://raw.githubusercontent.com/xdaiISU/ds202materials/master/hwlabs/fars2017/person.csv", stringsAsFactors = FALSE)
glc <- readxl::read_xlsx("FRPP_GLC.xlsx")

Preprocessing

# Preprocess GLC
glc$STATE = as.numeric(glc$`State Code`)

glc_states <- glc %>% group_by(STATE) %>% summarize(state_name=first(`State Name`))
acc_with_state_names <- acc %>% left_join(glc_states, by=c('STATE'))

1

acc %>% group_by(DAY_WEEK) %>%
  summarize(number_accidents=n()) %>%
  mutate(weekday=c('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')) %>%
  select(-DAY_WEEK) %>%
  subset(select=c(2,1))
## # A tibble: 7 x 2
##   weekday number_accidents
##   <chr>              <int>
## 1 Sun                 5360
## 2 Mon                 4374
## 3 Tue                 4347
## 4 Wed                 4314
## 5 Thu                 4621
## 6 Fri                 5358
## 7 Sat                 5873

Using the data manual, we know that values of DAY_WEEK 1-7 corresponds to days, Sun-Sat, which we can use to get the weekday label for each integer representation. In the resulting table, we can see that on Fridays, Saturdays, and Sundays, there are generally a significantly greater number of fatal accidents compared to Monday, Tuesday, Wednesday, or Thursday.

The greatest number of fatal accidents occur on Saturdays.

2

fatal <- ppl %>% filter(INJ_SEV==4)
paged_table(fatal)

Here we see a few entries of the dataset created which contains only the people who suffered fatal injuries.

3

# First get the MAKE for each accident
dat <- acc_with_state_names %>% inner_join(ppl, by=c('ST_CASE', 'STATE')) %>% select(STATE, state_name, MAKE)

most_dangerous <- dat %>%
  filter(!is.na(MAKE)) %>%
  group_by(STATE, state_name, MAKE) %>%
  summarize(number_accidents=n()) %>%
  filter(number_accidents==max(number_accidents))
## `summarise()` has grouped output by 'STATE', 'state_name'. You can override using the `.groups` argument.
most_dangerous %>% paged_table

Here we can see the most dangerous make ID for each state ID, as well as the number of fatal accidents for each of those state/make combinations.

4

states$region = toupper(states$region)

avg_state_locations <- states %>% group_by(region) %>% summarize(avg_long=mean(long), avg_lat=mean(lat))

dangerous_with_location <- most_dangerous %>% mutate(region=state_name) %>% inner_join(avg_state_locations, by=c('region'))

ggplot(states, aes(x=long, y=lat)) + geom_polygon(aes(group=group)) +
  geom_text_repel(data=dangerous_with_location, aes(x=avg_long, y=avg_lat, label=MAKE), color='green')

I define the most dangerous vehicle to be the MAKE code which is involved in the highest number of fatal accidents for each state. Here we can see the MAKE code for the most dangerous vehicle by that definition for each state.

5

acc_ppl <- acc %>% inner_join(ppl)
## Joining, by = c("STATE", "ST_CASE", "VE_FORMS", "COUNTY", "DAY", "MONTH", "HOUR", "MINUTE", "RUR_URB", "FUNC_SYS", "HARM_EV", "MAN_COLL", "SCH_BUS")
paged_table(acc_ppl)

Show above are a few entries of the joined dataframe. ST_CASE identifies each accident, so we could join by that variable, but we can also just allow R to match by all matching columns to get an equivalent result.

6

acc %>% group_by(DAY_WEEK) %>%
  summarize(number_accidents=n()) %>%
  mutate(weekday=c('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')) %>%
  select(-DAY_WEEK) %>%
  subset(select=c(2,1)) %>%
  ggplot(aes(x=weekday, weight=number_accidents)) + geom_bar() +
  xlab('Day of Week') + ylab('Number of Accidents')

acc %>% group_by(HOUR) %>%
  filter(HOUR != 99) %>%
  summarize(number_accidents=n()) %>%
  ggplot(aes(x=HOUR, weight=number_accidents)) + geom_bar() +
  xlab('Hour in Day') + ylab('Number of Accidents')

acc_ppl %>% group_by(SEX) %>%
  summarize(number_accidents=n()) %>%
  ggplot(aes(x=SEX, weight=number_accidents)) + geom_bar() +
  xlab('Sex Attribute Code') + ylab('Number of Accidents')

We can see from these summaries that Saturday is the most common day on which fatal accidents occur, fatal accidents are most common around hour 18 (6 PM), and that males (Sex Attribute 1) are about twice as likely to be in an accident that females (Sex Attribute 2).

7

counties <- map_data('county')

deaths_by_county <- acc_ppl %>% 
  filter(INJ_SEV==4) %>% 
  group_by(STATE, COUNTY) %>%
  summarize(total_deaths=n())
## `summarise()` has grouped output by 'STATE'. You can override using the `.groups` argument.
glc_counties <- glc %>%
  mutate(STATE=as.numeric(`State Code`), COUNTY=as.numeric(`County Code`), region=tolower(`State Name`), subregion=tolower(`County Name`)) %>%
  group_by(region, subregion, STATE, COUNTY) %>%
  summarize(region=first(region), subregion=first(subregion), STATE=first(STATE), COUNTY=first(COUNTY))
## `summarise()` has grouped output by 'region', 'subregion', 'STATE'. You can override using the `.groups` argument.
deaths_by_county_with_names <- deaths_by_county %>% left_join(glc_counties, by=c('STATE', 'COUNTY'))

dat <- counties %>% left_join(deaths_by_county_with_names, by=c('region', 'subregion'))

ggplot(dat, aes(x=long, y=lat, fill=total_deaths)) + geom_polygon(aes(group=group))

Above we can see the total deaths due to accidents by county. Grayed out counties had no records in the accident dataset, suggesting that most likely they have had no recorded fatal accidents.

8

summer <- acc_with_state_names %>% filter(MONTH %in% c(6,7,8))
winter <- acc_with_state_names %>% filter(MONTH %in% c(12, 1, 2))

nrow(summer)
## [1] 9205
nrow(winter)
## [1] 7733

We can see that there are more fatal accidents during the summer, suggesting that the summer is more dangerous than the winter in general.

summer1 <- summer %>% group_by(state_name) %>% summarize(num_summer_accidents=n())
winter1 <- winter %>% group_by(state_name) %>% summarize(num_winter_accidents=n())

dat <- inner_join(summer1, winter1)
## Joining, by = "state_name"
dat %>% filter(num_winter_accidents > num_summer_accidents) %>% paged_table

We can see that this does, however, depend on the state. The table above shows exactly which states for which there are more fatal accidents in the winter than in the summer.